# Let's import the boilerplate code. The main
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import mpl_toolkits
from datetime import datetime
import geopandas
import folium
import mpl_toolkits
import nltk
import textblob
# Show every column when displaying DataFrames (the listings file is wide).
pd.set_option('display.max_columns', None)
data_folder='../../data/airbnb/'
#files: calendar.csv.gz listings_summary.csv reviews.csv.gz
# listings.csv neighbourhoods.csv reviews_summary.csv
# listings.csv.gz neighbourhoods.geojson
# Load the raw Airbnb extracts. pandas transparently decompresses the .gz files.
data_calendar = pd.read_csv(data_folder+"calendar.csv")
data_listings_summary=pd.read_csv(data_folder+"listings_summary.csv")
data_reviews = pd.read_csv(data_folder+"reviews.csv.gz")
# low_memory=False avoids mixed-dtype chunk warnings on the wide listings file.
data_listings = pd.read_csv(data_folder+"listings.csv",low_memory=False, parse_dates=True)
data_neighbourhoods= pd.read_csv(data_folder+"neighbourhoods.csv", low_memory=False)
data_review_summary=pd.read_csv(data_folder+"reviews_summary.csv")
# NOTE(review): read_json on a .geojson returns the raw JSON structure, not a
# GeoDataFrame — geopandas.read_file would be the usual tool; confirm intent.
data_neighbourhoods_geo=pd.read_json(data_folder+"neighbourhoods.geojson")
# Keep only UK listings.
data_listings.drop(data_listings[data_listings.country_code!="GB"].index.values,axis=0,inplace=True)
#the price variable is in the summary file
# Join the summary price onto the main listings frame by listing id.
data_listings['price2'] = data_listings['id'].map(data_listings_summary.set_index('id')['price'])
#Examine the null data and missing values
data_listings.isnull().sum()[data_listings.isnull().sum()>0].plot(kind='barh',figsize=(12,10));
# Drop rows with unusable core fields: zero/missing beds and missing scores.
data_listings.drop(data_listings[data_listings.beds==0].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.beds.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.review_scores_rating.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.review_scores_cleanliness.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.review_scores_location.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.host_is_superhost.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.host_has_profile_pic.isnull()].index.values,axis=0, inplace=True)
data_listings.drop(data_listings[data_listings.property_type=="Parking Space"].index.values,axis=0, inplace=True)
# Price outlier removal. NOTE: the ==10000 drop is redundant — the >250 drop
# below already removes those rows.
data_listings.drop(data_listings[data_listings.price2==10000].index.values,inplace=True)
data_listings.drop(data_listings[data_listings.price2>250].index.values,inplace=True)
data_listings.drop(data_listings[data_listings.price2==0].index.values,inplace=True)
data_listings.drop(data_listings[data_listings.reviews_per_month.isnull()].index.values,axis=0, inplace=True)
#Let's trim unnecessary columns like url to streamline our data
data_listings.drop(["scrape_id","listing_url","interaction","notes","thumbnail_url","medium_url","picture_url","host_url","host_name","host_response_time","host_response_rate","host_acceptance_rate","host_thumbnail_url","host_picture_url","host_listings_count","host_verifications","host_identity_verified"],axis=1,inplace=True)
data_listings.drop(["first_review","license","jurisdiction_names","requires_license","require_guest_phone_verification"],axis=1,inplace=True)
data_listings.drop(["xl_picture_url","host_location","market","smart_location","country_code","country","is_location_exact"],axis=1,inplace=True)
data_listings.drop(["host_neighbourhood","street","square_feet","weekly_price","monthly_price","security_deposit","cleaning_fee","guests_included","review_scores_accuracy","review_scores_checkin","review_scores_communication","review_scores_value"],axis=1,inplace=True)
data_listings.drop(["neighbourhood_group_cleansed","minimum_nights_avg_ntm","maximum_nights_avg_ntm","calendar_updated","calculated_host_listings_count","calculated_host_listings_count_entire_homes","calculated_host_listings_count_private_rooms","calculated_host_listings_count_shared_rooms"],axis=1,inplace=True)
data_listings.drop(["transit","zipcode","city","state"],axis=1,inplace=True)
data_listings.drop(['maximum_nights','minimum_minimum_nights', 'maximum_minimum_nights','minimum_maximum_nights', 'maximum_maximum_nights'],axis=1,inplace=True)
data_listings.drop(["bathrooms","bedrooms","neighbourhood",'availability_30', 'availability_60', 'availability_90',
'availability_365', 'calendar_last_scraped',"is_business_travel_ready"],axis=1,inplace=True)
data_listings.drop(["has_availability","extra_people"],axis=1,inplace=True)
data_listings.drop(["number_of_reviews_ltm"],axis=1,inplace=True)
#Let's visualise now:
#Examine the null data and missing values (after the drops above)
data_listings.isnull().sum()[data_listings.isnull().sum()>0].plot(kind='bar',figsize=(12,10));
#Let's review our final data
data_listings.describe()
# Target feature: price normalised by how many guests the listing accommodates.
data_listings['price_per_guest']=data_listings['price2']/data_listings['accommodates']
from scipy import stats
# Q-Q plot against a normal distribution to eyeball skew.
res = stats.probplot(data_listings['price_per_guest'], plot=mpl.pyplot)
data_listings.price_per_guest.hist(bins=30,figsize=(5,4));
## Let's check for skew and kurtosis before and after a log1p transform.
print("Skewness before log transform: ", data_listings.price_per_guest.skew())
print("Kurtosis before log transform: ",data_listings.price_per_guest.kurt())
Y = np.log1p(data_listings.price_per_guest)
# BUG FIX: these two prints said "before log transform" although they report
# the transformed series — corrected to "after".
print("Skewness after log transform: ", Y.skew())
print("Kurtosis after log transform: ",Y.kurt())
#Let's visualise the histogram of the transformed price
Y.hist(bins=30,figsize=(5,4));
#Looks much better, let's normalise the column in place.
data_listings.price_per_guest = np.log1p(data_listings.price_per_guest)
res = stats.probplot(data_listings['price_per_guest'], plot=mpl.pyplot)
# Let's start with some basic visualisations: correlation of numeric features
# with the (log) price-per-guest target.
num=data_listings.select_dtypes(exclude='object')
numcorr=num.corr()
f,ax=mpl.pyplot.subplots(figsize=(17,3))
# head(1) keeps only the price_per_guest row after sorting, giving a one-row heatmap.
sns.heatmap(numcorr.sort_values(by=['price_per_guest'], ascending=False).head(1), cmap='Blues')
# BUG FIX: the title said "sale price" (copied from a housing example) but the
# plotted correlations are with price per guest.
mpl.pyplot.title(" Numerical features correlation with the price per guest", weight='bold', fontsize=18)
mpl.pyplot.xticks()
mpl.pyplot.yticks(weight='bold', color='dodgerblue', rotation=0)
mpl.pyplot.show()
#Let's also check for multi-collinearity visually with pair plots
# (only numeric columns are actually plotted by seaborn; the object columns
# listed here are passed through but ignored for the scatter grid).
sns.pairplot(data_listings[['neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
                            'room_type', 'beds', 'bed_type', 'amenities', 'price',
                            'minimum_nights', 'review_scores_rating', 'review_scores_cleanliness',
                            'review_scores_location','reviews_per_month',
                            'price_per_guest']])
#zonation data: rough mapping of London boroughs to travel fare zones.
Zone1=["City of London", "Camden", "Hackney", "Islington", "Kensington and Chelsea", "Southwark", "Westminster"]
Zone2=["Greenwich", "Hackney", "Hammersmith and Fulham", "Lewisham", "Tower Hamlets", "Wandsworth","Lambeth"]
Zone3=["Barnet", "Brent", "Bromley", "Croydon", "Ealing","Hounslow", "Newham", "Richmond upon Thames", "Waltham Forest"]
Zone4=["Barking and Dagenham", "Bexley","Enfield", "Haringey","Merton", "Newham", "Redbridge", "Sutton","Kingston upon Thames"]
Zone5=["Harrow", "Hillingdon","Havering"]
#count=1
zones= [Zone1,Zone2,Zone3,Zone4,Zone5]
# Build borough -> "ZoneN" lookup. NOTE(review): boroughs listed in more than
# one zone (e.g. Hackney, Newham) end up mapped to the LAST matching zone,
# because the inner loop keeps overwriting — confirm this is intended.
mapping=dict()
for bo in data_listings.neighbourhood_cleansed.unique():
    if bo in (Zone1 + Zone2 + Zone3 + Zone4 + Zone5):
        for count,zns in enumerate(zones):
            if bo in zns:
                mapping[bo]="Zone" + str(count+1)
    else:
        # Boroughs not covered by any zone list are printed for inspection.
        print(bo)
def zonify(neighbourhood_cleansed):
    """Return the rough fare-zone label ("Zone1".."Zone5") for a borough.

    Falls back to None when the borough is absent from the mapping.
    """
    zone_label = mapping.get(neighbourhood_cleansed)
    return zone_label
# Attach the rough fare zone to every listing and look at price by zone.
data_listings["Fare_Zone_Rough"]=data_listings['neighbourhood_cleansed'].apply(zonify)
# NOTE(review): figsize/c are not documented violinplot parameters — newer
# seaborn versions may raise or ignore them; confirm against your version.
sns.violinplot(x="Fare_Zone_Rough",y="price_per_guest",data=data_listings, figsize=(12,10),c='viridis')
def generateBaseMap(default_location=None, default_zoom_start=12):
    """Create a folium base map centred on *default_location*.

    Parameters
    ----------
    default_location : list[float] | None
        [lat, lon] map centre; defaults to [51.4975, 0.0007] (central London).
    default_zoom_start : int
        Initial zoom level.

    Returns
    -------
    folium.Map
    """
    # BUG FIX: a mutable list literal was used as the default argument; use a
    # None sentinel instead (same behaviour for all existing callers).
    if default_location is None:
        default_location = [51.4975, 0.0007]
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map
from folium.plugins import HeatMap
from folium.plugins import Fullscreen
df_copy = data_listings[:1000].copy()
#Let's get a weighting for the heatmap
data_listings['pppg_prop']=data_listings['price_per_guest']/data_listings.price_per_guest.max()
base_map = generateBaseMap()
HeatMap(data=data_listings[['latitude', 'longitude','pppg_prop']][:30000], radius=8, max_zoom=13,blur=15).add_to(base_map);
base_map
base_map.add_child(Fullscreen(position='topleft', title='Full Screen', title_cancel='Exit Full Screen', force_separate_button=False))
for field in ['require_guest_profile_picture','instant_bookable','host_is_superhost','host_has_profile_pic']:
data_listings[field]=data_listings[field].map({'t':(field +' true'),'f':(field +' false')})
cates=data_listings[['room_type','bed_type','Fare_Zone_Rough','property_type','cancellation_policy','require_guest_profile_picture','instant_bookable','host_is_superhost','host_has_profile_pic']]
data_listings=pd.concat([data_listings,pd.get_dummies(data=cates, drop_first=True)], axis=1)
data_listings.head()
# drop the original variables
#data_listings.drop(['Fare_Zone_Rough','property_type','cancellation_policy','require_guest_profile_picture','instant_bookable','host_is_superhost','host_has_profile_pic'],axis=1,inplace=True)
data_listings.isnull().sum()[data_listings.isnull().sum()>0]
# Feature matrix for clustering: numeric review features plus the one-hot
# encoded zone / property / booking-flag / bed / room dummies created above.
X=data_listings[['beds','number_of_reviews',
                 'review_scores_rating', 'review_scores_cleanliness',
                 'review_scores_location', 'reviews_per_month','Fare_Zone_Rough_Zone3', 'Fare_Zone_Rough_Zone4',
                 'Fare_Zone_Rough_Zone5', 'property_type_Apartment',
                 'property_type_Barn', 'property_type_Bed and breakfast',
                 'property_type_Boat', 'property_type_Boutique hotel',
                 'property_type_Bungalow', 'property_type_Cabin',
                 'property_type_Camper/RV', 'property_type_Casa particular (Cuba)',
                 'property_type_Chalet', 'property_type_Condominium',
                 'property_type_Cottage', 'property_type_Earth house',
                 'property_type_Farm stay', 'property_type_Guest suite',
                 'property_type_Guesthouse', 'property_type_Hostel',
                 'property_type_Hotel', 'property_type_House', 'property_type_Houseboat',
                 'property_type_Hut', 'property_type_Island', 'property_type_Lighthouse',
                 'property_type_Loft', 'property_type_Minsu (Taiwan)',
                 'property_type_Nature lodge', 'property_type_Other',
                 'property_type_Ryokan (Japan)', 'property_type_Serviced apartment',
                 'property_type_Tent', 'property_type_Tiny house',
                 'property_type_Townhouse', 'property_type_Treehouse',
                 'property_type_Villa', 'property_type_Yurt',
                 'require_guest_profile_picture_require_guest_profile_picture true',
                 'instant_bookable_instant_bookable true',
                 'host_is_superhost_host_is_superhost true',
                 'host_has_profile_pic_host_has_profile_pic true','bed_type_Couch', 'bed_type_Futon',
                 'bed_type_Pull-out Sofa', 'bed_type_Real Bed','room_type_Hotel room', 'room_type_Private room',
                 'room_type_Shared room']]
# Target: log1p price per guest (transformed earlier).
Y=data_listings.price_per_guest
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed for reproducibility.
X_train_cluster, X_test_cluster, Y_train_cluster, Y_test_cluster = train_test_split(X, Y, test_size = .20, random_state = 40)
from sklearn import metrics
from sklearn.cluster import KMeans
# Elbow / silhouette scan over k = 2..10 to pick a cluster count.
inertias = []
ss = []
for k in range(2,11):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(X_train_cluster)
    inertias.append(kmeans.inertia_)
    # Silhouette is O(n^2) in samples — this loop can be slow on large data.
    ss.append(metrics.silhouette_score(X_train_cluster, kmeans.labels_, metric='euclidean'))
mpl.pyplot.plot(range(2, 11), inertias,label='inertia')
mpl.pyplot.title('Choosing the number of clusters')
mpl.pyplot.xlabel('Number of clusters')
mpl.pyplot.ylabel('Inertia')
# NOTE(review): inertia and silhouette share one axis despite very different
# scales, so the silhouette curve will look flat — confirm this is acceptable.
mpl.pyplot.plot(range(2, 11), ss,label='silhouete score')
mpl.pyplot.title('Choosing the number of clusters')
mpl.pyplot.xlabel('Number of clusters')
mpl.pyplot.ylabel('Silhouette Score')
mpl.pyplot.legend()
mpl.pyplot.show()
# Final clustering with k=4, fitted on the FULL feature matrix X (not just the
# training split) so every listing receives a cluster label.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans.fit_predict(X)
data_listings['clustered']=kmeans.labels_
data_listings.clustered.head()
# With multivariate clustering, it can be hard to visualise the clusters, but we can see if some natural segmentation occurs element-wise.
mpl.pyplot.scatter(data_listings.number_of_reviews,data_listings.price_per_guest,c=data_listings.clustered,label="clusters")
mpl.pyplot.legend(loc='upper right')
mpl.pyplot.xlabel('Number of Reviews')
mpl.pyplot.ylabel('Price per guest')
# Cluster size distribution.
data_listings.clustered.hist(bins=30);
#with clustered data: pairplot coloured by cluster membership
sns.pairplot(data_listings[['neighbourhood_cleansed', 'latitude', 'longitude',
                            'room_type', 'beds', 'bed_type', 'amenities', 'review_scores_rating', 'review_scores_cleanliness',
                            'review_scores_location', 'reviews_per_month',
                            'price_per_guest','clustered']], hue='clustered')
# Rebuild the feature matrix, now including the KMeans cluster label as a
# feature for the price regression.
X=data_listings[['beds','number_of_reviews',
                 'review_scores_rating', 'review_scores_cleanliness',
                 'review_scores_location', 'reviews_per_month','Fare_Zone_Rough_Zone3', 'Fare_Zone_Rough_Zone4',
                 'Fare_Zone_Rough_Zone5', 'property_type_Apartment',
                 'property_type_Barn', 'property_type_Bed and breakfast',
                 'property_type_Boat', 'property_type_Boutique hotel',
                 'property_type_Bungalow', 'property_type_Cabin',
                 'property_type_Camper/RV', 'property_type_Casa particular (Cuba)',
                 'property_type_Chalet', 'property_type_Condominium',
                 'property_type_Cottage', 'property_type_Earth house',
                 'property_type_Farm stay', 'property_type_Guest suite',
                 'property_type_Guesthouse', 'property_type_Hostel',
                 'property_type_Hotel', 'property_type_House', 'property_type_Houseboat',
                 'property_type_Hut', 'property_type_Island', 'property_type_Lighthouse',
                 'property_type_Loft', 'property_type_Minsu (Taiwan)',
                 'property_type_Nature lodge', 'property_type_Other',
                 'property_type_Ryokan (Japan)', 'property_type_Serviced apartment',
                 'property_type_Tent', 'property_type_Tiny house',
                 'property_type_Townhouse', 'property_type_Treehouse',
                 'property_type_Villa', 'property_type_Yurt',
                 'require_guest_profile_picture_require_guest_profile_picture true',
                 'instant_bookable_instant_bookable true',
                 'host_is_superhost_host_is_superhost true',
                 'host_has_profile_pic_host_has_profile_pic true','bed_type_Couch', 'bed_type_Futon',
                 'bed_type_Pull-out Sofa', 'bed_type_Real Bed','room_type_Hotel room', 'room_type_Private room',
                 'room_type_Shared room','clustered']]
#split the data into simple train_test_split
Y=data_listings.price_per_guest
from sklearn.model_selection import train_test_split
X_train_prel, X_test_prel, Y_train_prel, Y_test_prel = train_test_split(X, Y, test_size = .20, random_state = 40)
#try OLS as a baseline and to inspect coefficient significance
import statsmodels.api as sm
# add_constant appends the intercept column required by statsmodels OLS.
X = sm.add_constant(X_train_prel)
model = sm.OLS(Y_train_prel, X)
results = model.fit()
results.summary()
# At significance level of 0.05, features to drop (p > 0.05):
#remove property type barn, property type bed and breakfast, property_type_boat, property type boutique hotels, property_type apartment, property type bungalow, property type cabin, property type camper, property type casa, property type chalet,
#remove property condo, property type cottage, property type earth, property type farm stay, property type guest suite, property type guest house, property type hotel, property type house, property type house boat, hut, island
#lighthouse, loft, minsu, nature lodge, other, ryokan, serviced apartment, property type tent, property type tiny house
#real bed, pull out sofa, futon, couch, host profile pic, requires guest profile, yurt, villa
# Pruned feature matrix after the OLS significance review above.
# NOTE(review): several columns named in the "remove" comments are still
# present here (e.g. Barn, Boat, Villa) — confirm the pruning is as intended.
X=data_listings[['beds','number_of_reviews',
                 'review_scores_rating', 'review_scores_cleanliness',
                 'review_scores_location', 'reviews_per_month','Fare_Zone_Rough_Zone3', 'Fare_Zone_Rough_Zone4',
                 'Fare_Zone_Rough_Zone5', 'property_type_Apartment',
                 'property_type_Barn', 'property_type_Bed and breakfast',
                 'property_type_Boat', 'property_type_Boutique hotel',
                 'property_type_Bungalow', 'property_type_Cabin',
                 'property_type_Camper/RV', 'property_type_Condominium',
                 'property_type_Cottage', 'property_type_Earth house', 'property_type_Guest suite',
                 'property_type_Guesthouse', 'property_type_Hostel',
                 'property_type_Hotel', 'property_type_House', 'property_type_Houseboat',
                 'property_type_Hut', 'property_type_Lighthouse',
                 'property_type_Loft', 'property_type_Other',
                 'property_type_Serviced apartment', 'property_type_Tiny house', 'property_type_Treehouse',
                 'property_type_Villa',
                 'require_guest_profile_picture_require_guest_profile_picture true',
                 'instant_bookable_instant_bookable true',
                 'host_is_superhost_host_is_superhost true',
                 'host_has_profile_pic_host_has_profile_pic true', 'room_type_Private room',
                 'room_type_Shared room','clustered']]
# NLP tooling for the free-text listing fields.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
import nltk
# IPython magic — this file is a notebook export and is not plain-Python runnable.
%matplotlib inline
# Early attempts showed we had a series of non-english characters - we will simply replace them stringwise
text_fields=['amenities','summary','space', 'description', 'experiences_offered', 'neighborhood_overview']
for field in text_fields:
    # BUG FIX: pandas >= 2.0 treats str.replace patterns as literal text unless
    # regex=True is passed; the non-ASCII character class must be a regex.
    data_listings[field]= data_listings[field].str.replace(r'[^\x00-\x7F]+', '', regex=True)
# Concatenate the text fields into one document per listing. NaN in any field
# makes the whole concatenation NaN; those rows are dropped just below.
data_listings['text_all']=data_listings['name']+" "+data_listings['summary']+' '+data_listings['amenities']+" "+data_listings['neighborhood_overview']
#drop where text_all is null
data_listings.drop(data_listings[data_listings.text_all.isnull()].index.values,axis=0, inplace=True)
# Final feature matrix: the pruned numeric/dummy features plus the raw text
# column (vectorised separately below).
X=data_listings[['beds','number_of_reviews',
                 'review_scores_rating', 'review_scores_cleanliness',
                 'review_scores_location', 'reviews_per_month','Fare_Zone_Rough_Zone3', 'Fare_Zone_Rough_Zone4',
                 'Fare_Zone_Rough_Zone5', 'property_type_Apartment',
                 'property_type_Barn', 'property_type_Bed and breakfast',
                 'property_type_Boat', 'property_type_Boutique hotel',
                 'property_type_Bungalow', 'property_type_Cabin',
                 'property_type_Camper/RV', 'property_type_Condominium',
                 'property_type_Cottage', 'property_type_Earth house', 'property_type_Guest suite',
                 'property_type_Guesthouse', 'property_type_Hostel',
                 'property_type_Hotel', 'property_type_House', 'property_type_Houseboat',
                 'property_type_Hut', 'property_type_Lighthouse',
                 'property_type_Loft', 'property_type_Other',
                 'property_type_Serviced apartment', 'property_type_Tiny house', 'property_type_Treehouse',
                 'property_type_Villa',
                 'require_guest_profile_picture_require_guest_profile_picture true',
                 'instant_bookable_instant_bookable true',
                 'host_is_superhost_host_is_superhost true',
                 'host_has_profile_pic_host_has_profile_pic true', 'room_type_Private room',
                 'room_type_Shared room','clustered','text_all']]
y=data_listings.price_per_guest
# Default 75/25 split, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Optional word-cloud exploration (disabled):
# text_string=data_listings.text_all.str.cat(sep='')
# from PIL import Image
# from wordcloud import WordCloud
# wordcloud = WordCloud(width = 8000, height = 8000,
#                 background_color ='white',
#                 stopwords = stop,
#                 min_font_size = 10).generate(text_string)
# mpl.pyplot.figure(figsize=(15,8))
# mpl.pyplot.imshow(wordcloud)
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download the corpora needed for tokenisation / lemmatisation / stopwords.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# NOTE: 'or', 'in', 'to', 'of', 'the' are already in the NLTK english list,
# and 'in' is appended twice — the duplicates are harmless but redundant.
stop.append('or')
stop.append('in')
stop.append('to')
stop.append('km')
stop.append('the')
stop.append('in')
stop.append('of')
#We're appending words we had seen earlier and would like to get rid of
stemmer = PorterStemmer()
lemma= WordNetLemmatizer()
# Stem each word.
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    """Yield the WordNet lemma of every token produced by the base analyzer."""
    for token in analyzer(doc):
        yield lemma.lemmatize(token)
from sklearn.ensemble import RandomForestRegressor
# BUG FIX: r2_score was imported only AFTER its first use below, which raises
# NameError when the script runs top-to-bottom — import moved up here.
from sklearn.metrics import r2_score
# TF-IDF over the combined listing text.
# NOTE(review): sklearn ignores lowercase/stop_words/ngram_range when a
# callable `analyzer` is supplied — confirm those options are meant to apply.
tfid_vect=TfidfVectorizer(lowercase=True,stop_words=stop,analyzer=stemmed_words,min_df=3,ngram_range=(1,3))
X_train_dtm = tfid_vect.fit_transform(X_train.text_all)
# BUG FIX: print((...)) printed a tuple repr (a 2to3 artifact).
print('Features:', X_train_dtm.shape[1])
X_test_dtm = tfid_vect.transform(X_test.text_all)
# (the earlier redundant default-parameter RandomForestRegressor() was removed;
# this instance immediately replaced it)
rf_reg = RandomForestRegressor(n_estimators=100)
rf_reg.fit(X_train_dtm, y_train)
y_pred_class = rf_reg.predict(X_test_dtm)
# BUG FIX: the metric is R^2, not accuracy, and the tuple-print is removed.
print('R2 score:', r2_score(y_test, y_pred_class))
#(tfid_vect.get_feature_names_out())
# BUG FIX: get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead.
feature_importance =pd.DataFrame({'feature':tfid_vect.get_feature_names_out(), 'importance':rf_reg.feature_importances_}).sort_values(by='importance',ascending=False)
feature_importance[:20].plot('feature','importance',kind='barh')
#pd.DataFrame(tfid_vect.fit_transform(X_test).toarray(), columns=tfid_vect.get_feature_names_out())
import scipy as sp
# Stack the numeric/dummy features alongside the TF-IDF document-term matrix
# so one sparse matrix feeds the regressor.
extra =sp.sparse.csr_matrix(X_train.drop('text_all',axis=1)).astype('float')
extra.shape
X_train_dtm_extra = sp.sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape
# Same for the test split (cast before conversion here; equivalent result).
extra = sp.sparse.csr_matrix(X_test.drop('text_all', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# Earlier exhaustive parameter scan (disabled — very slow):
# r2_list=[]
# mse_list=[]
# parameter_tuning= list(range(1, 9131))
# for param in parameter_tuning:
#     params = {'max_features': 9130, 'min_samples_split': 2,
#               'learning_rate': 0.01, 'loss': 'ls','max_depth':12}
#     clf = GradientBoostingRegressor(**params)
#     clf.fit(X_train_dtm_extra,y_train)
#     mse_list.append(mean_squared_error(y_test, clf.predict(X_test_dtm_extra)))
#     r2_list.append(r2_score(y_test,clf.predict(X_test_dtm_extra)))
# Final model with the tuned parameters.
# NOTE(review): loss='ls' was renamed to 'squared_error' and removed in
# scikit-learn >= 1.2 — confirm the installed version accepts 'ls'.
params = {'max_features': 7600, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls','max_depth':12, 'n_estimators':500}
clf = GradientBoostingRegressor(**params)
# IPython magic: times the fit (notebook-only syntax).
%timeit clf.fit(X_train_dtm_extra,y_train)
print("MSE: ",mean_squared_error(y_test, clf.predict(X_test_dtm_extra)))
print("R2 Score: ",r2_score(y_test,clf.predict(X_test_dtm_extra)))
# #depth =9, max features
# Second (disabled) scan over max_features:
# r2_list=[]
# mse_list=[]
# parameter_tuning= list(range(7500, 13224,100))
# for param in parameter_tuning:
#     params = {'max_features': param, 'min_samples_split': 2,
#               'learning_rate': 0.01, 'loss': 'ls','max_depth':12}
#     clf = GradientBoostingRegressor(**params)
#     clf.fit(X_train_dtm_extra,y_train)
#     mse_list.append(mean_squared_error(y_test, clf.predict(X_test_dtm_extra)))
#     r2_list.append(r2_score(y_test,clf.predict(X_test_dtm_extra)))
# import timeit
# timeit
# WARNING: r2_list / mse_list / parameter_tuning are only defined by the
# commented-out loop above — running this section without it raises NameError.
len(r2_list)
len(parameter_tuning[:27])
mpl.pyplot.plot(parameter_tuning[:27],r2_list,label="R2 Score");
mpl.pyplot.plot(parameter_tuning[:27],mse_list,label="MSE Score");
mpl.pyplot.legend();
# Best (R2, max_features) pair from the scan.
sorted(zip(r2_list, parameter_tuning[:27]))[-1]
# Track test-set deviance at each boosting stage to visualise over/under-fit.
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
# NOTE(review): the private clf.loss_ attribute was removed in newer
# scikit-learn releases — confirm against the installed version.
for i, y_pred in enumerate(clf.staged_predict(X_test_dtm_extra)):
    test_score[i] = clf.loss_(y_test, y_pred)
mpl.pyplot.figure(figsize=(12, 6))
mpl.pyplot.subplot(1, 2, 1)
mpl.pyplot.title('Deviance')
mpl.pyplot.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
                label='Training Set Deviance')
mpl.pyplot.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
                label='Test Set Deviance')
mpl.pyplot.legend(loc='upper right')
mpl.pyplot.xlabel('Boosting Iterations')
mpl.pyplot.ylabel('Deviance')
# Hand-recorded model-iteration history used to animate progress as a GIF.
attempt_data = {"Model_Iter":[1,2,3,4,5,6,7,8],"R2_Score":[0.10,0.26,0.280380019399908,0.2831359818817798,0.3522388190740968,0.3278027497723598,0.31852965100776354,0.4497024717780622],"Model Complexity":[17,17,55,55,9549,13244,13224,13224],"Model":['Linear','Linear','DecisionTree','DecisionTree','NLP- Random Forest','GradientBoostingRF','GradientBoostingRF','GradientBoostingRF']}
attempts_pd = pd.DataFrame(attempt_data)
# Marker size scaled from model complexity.
attempts_pd["size"]=0.6*attempts_pd["Model Complexity"]
#modeldevelopment gif code below: save one cumulative frame per attempt
for i in range(8):
    mpl.pyplot.xlabel("Modelling Attempts")
    mpl.pyplot.ylabel("R2_score")
    mpl.pyplot.title("Model R2_score: "+str(i) )
    filename=str(i)+".png"
    mpl.pyplot.scatter(attempts_pd.loc[i].Model_Iter,attempts_pd.loc[i].R2_Score,s=attempts_pd.loc[i]["size"])
    mpl.pyplot.savefig(filename, dpi=96);
import glob
import os
# Assemble the saved frames into a GIF via ImageMagick's `convert` CLI —
# requires ImageMagick on the PATH; os.system runs through the shell.
gif_name = 'outputName'
file_list = glob.glob('*.png') # Get all the pngs in the current directory
#list.sort(file_list, key=lambda x: int(x.split('_')[1].split('.png')[0])) # Sort the images by #, this may need to be tweaked for your use case
with open('image_list.txt', 'w') as file:
    for item in file_list:
        file.write("%s\n" % item)
os.system('convert @image_list.txt {}.gif'.format(gif_name))
import glob
import moviepy.editor as mpy
# Alternative GIF assembly that stays in Python (no ImageMagick dependency).
gif_name = 'outputName2'
fps = 0.5
file_list = glob.glob('*.png') # Get all the pngs in the current directory
#list.sort(file_list, key=lambda x: int(x.split('_')[1].split('.png')[0])) # Sort the images by #, this may need to be tweaked for your use case
clip = mpy.ImageSequenceClip(file_list, fps=fps)
clip.write_gif('{}.gif'.format(gif_name), fps=fps)
import requests
import urllib
import re
from bs4 import BeautifulSoup
#from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
url_ex='https://www.airbnb.co.uk/rooms/40882994?source_impression_id=p3_1582740281_d51OJ4DhquMGYfGJ'
example1= requests.get(url_ex)
soup =BeautifulSoup(example1.content,'html.parser')
name=(soup.find(id='summary').find('span',class_= '_18hrqvin')).text
listing_info = (soup.find_all(class_='_36rlri',style="margin-right:24px"))
guests=int(listing_info[0].text.split(" guests")[0])
property_type=listing_info[1].text.split(" beds")[0]
beds_no = int(listing_info[2].text.split(" bed")[0])
#superhost - default is 1
summary_listing=soup.find(id='details').find('span',class_= '_czm8crp').text
review_score_overall=float(soup.find(id='reviews').find('div',class_= '_1iv05u9z').text[:4])
review_score_loc=float((soup.find(id='reviews').find_all(class_='_1p3joamp'))[0].text)
review_score_clean=float((soup.find(id='reviews').find_all(class_='_1p3joamp'))[4].text)
zone_dummy=mapping.get('Barnet')
neighbourhood_ov=str(input("Neighboughood Overview here: "))
review_no= soup.find(id='reviews').find('span',class_= '_krjbj').text[:4]
# Multi-select widget for the amenities the new listing offers.
selector=widgets.SelectMultiple(
    options=['Laptop-friendly','First aid kit','Fire extinguisher','Wifi', 'Dryer', 'Smoke Detector','Essentials','Heating','Iron','TV','Washing Machine','Kitchen','Hair dryer','Hangers','Carbon monoxide alarm','Air conditioning','Private entrance','shampoo','Hot water',
             'Breakfast'],
    value=['Wifi'],
    rows=15,
    # BUG FIX: the label said 'Fruits' — a leftover from the ipywidgets docs
    # example; this selector lists amenities.
    description='Amenities',
    disabled=False
)
display(selector)
amenities= set()
# NOTE(review): selector.value is a tuple, so the set holds ONE tuple of all
# selections rather than individual amenity strings — confirm this is intended
# before it is stringified into text_all below.
amenities.add(selector.value)
# Echo the scraped/entered listing attributes for a sanity check.
print("Name: "+name)
print("Guests: "+str(guests))
print("Property Type: "+ str(property_type))
print("Beds No: " + str(beds_no))
print(summary_listing[:150])
print(review_score_overall)
print(review_score_loc)
print(review_score_clean)
print(amenities)
# Every model feature the manual-input record must provide.
fieldsa={'beds', 'number_of_reviews','review_scores_rating','review_scores_cleanliness','review_scores_location',
         "require_guest_profile_picture_require_guest_profile_picture true", 'instant_bookable_instant_bookable true','host_is_superhost_host_is_superhost true', 'host_has_profile_pic_host_has_profile_pic true',
         'reviews_per_month','Fare_Zone_Rough_Zone3', 'Fare_Zone_Rough_Zone4',
         'Fare_Zone_Rough_Zone5', 'property_type_Apartment',
         'property_type_Barn', 'property_type_Bed and breakfast',
         'property_type_Boat', 'property_type_Boutique hotel',
         'property_type_Bungalow', 'property_type_Cabin',
         'property_type_Camper/RV', 'property_type_Condominium',
         'property_type_Cottage', 'property_type_Earth house', 'property_type_Guest suite',
         'property_type_Guesthouse', 'property_type_Hostel',
         'property_type_Hotel', 'property_type_House', 'property_type_Houseboat',
         'property_type_Hut', 'property_type_Lighthouse',
         'property_type_Loft', 'property_type_Other',
         'property_type_Serviced apartment', 'property_type_Tiny house', 'property_type_Treehouse',
         'property_type_Villa',
         'room_type_Private room',
         'room_type_Shared room'}
# Initialise the prediction record with NaN for every expected feature.
new_input = dict.fromkeys(fieldsa, np.nan)
def update_new_input(new_input):
    """Fill *new_input* in place with the scraped/entered listing features.

    Reads the module-level scrape results (beds_no, review_no, name, ...) and
    prompts for the borough and neighbourhood overview. Mutates and does not
    return the dict.
    """
    new_input['beds']=beds_no
    # NOTE(review): review_no is a raw string slice from the page — presumably
    # should be numeric; confirm downstream expectations.
    new_input['number_of_reviews']=review_no
    new_input['review_scores_rating']=review_score_overall
    # BUG FIX: the original line used ':' (a bare annotation statement), so the
    # cleanliness score was never actually stored.
    new_input['review_scores_cleanliness'] = review_score_clean
    new_input["require_guest_profile_picture_require_guest_profile_picture true"]=1
    new_input['instant_bookable_instant_bookable true']=1
    new_input['host_is_superhost_host_is_superhost true']=0
    # BUG FIX: a trailing comma made the stored value the tuple (1,) instead of 1.
    new_input['host_has_profile_pic_host_has_profile_pic true'] = 1
    new_input['name']=name
    new_input['amenities']=amenities
    new_input['summary']=summary_listing
    new_input['review_scores_location']=review_score_loc
    #Update as needed using these
    # TODO(review): property_type and zone_dummy are computed but never written
    # into new_input — confirm whether the dummies should be set from them.
    property_type=listing_info[1].text.split(" beds")[0]
    zone_dummy=mapping.get(input('Type borough here '))
    neighbourhood_ov=str(input('Type neighbourhood overview here '))
    new_input['neighborhood_overview']=neighbourhood_ov
    text_string= name+ " " + summary_listing +" " + str(amenities) + " "+ neighbourhood_ov
    new_input['text_all']=text_string;
zone_dummy
# BUG FIX: DataFrame/Series.append was deprecated and removed in pandas 2.0;
# use pd.concat to add the manual record and its price.
X_test = pd.concat([X_test, pd.DataFrame([new_input])], ignore_index=True)
# BUG FIX: input() returns a str — cast to float before np.log1p (the target
# is log1p-transformed, so the entered price must be too).
y_test = pd.concat([y_test, pd.Series([np.log1p(float(input("Enter Price Here")))])], ignore_index=True)
# Take the last (just-appended) row as a one-row frame for prediction.
X_test_trial=X_test.loc[X_test.shape[0]-1].to_frame().T
X_test_dtm_trialed = tfid_vect.transform(X_test_trial.text_all)
# BUG FIX: transform was given the whole DataFrame (which iterates column
# names, not documents) — pass the text column; get_feature_names() was also
# removed in scikit-learn 1.2.
pd.DataFrame(tfid_vect.transform(X_test_trial.text_all).toarray(), columns=tfid_vect.get_feature_names_out()).max(axis=0).sort_values(ascending=False)[:20]
X_test_dtm_trialed
# BUG FIX: np.float was removed in NumPy 1.24 — the builtin float is the
# documented replacement.
trial_xtra=sp.sparse.csr_matrix(X_test_trial.drop('text_all',axis=1)).astype(float)
trial_xtra.shape
# Predict with the text-only random forest fitted earlier.
y_pred_class_trialed = rf_reg.predict(X_test_dtm_trialed)
#print(('Accuracy: ', r2_score(y_test, y_pred_class)))